Import libraries and dataset
# Import libraries and load the Kirby21 multi-level brain-volume dataset.
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# read_csv already returns a DataFrame — the original pd.DataFrame(df)
# re-wrap was a no-op and has been removed.
df = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
df.head(5)

# Restrict to the single subject/scan of interest.
subjofint = df.loc[df.rawid == "kirby906a_ax.img"]
print(subjofint)

# Total volume per ROI within each (type, level) grouping.
summary_measures = subjofint[["type", "level", "roi", "volume"]].groupby(
    ["type", "level", "roi"], as_index=False
).sum()

# Keep only type-1 (the first segmentation type) rows.
type1 = summary_measures.loc[summary_measures.type == 1]
print(type1)
Unnamed: 0 rawid roi volume min max \
12540 12541 kirby906a_ax.img Telencephalon_L 467063 2.0 350.0
12541 12542 kirby906a_ax.img Telencephalon_R 470488 2.0 337.0
12542 12543 kirby906a_ax.img Diencephalon_L 8801 60.0 327.0
12543 12544 kirby906a_ax.img Diencephalon_R 9054 63.0 415.0
12544 12545 kirby906a_ax.img Mesencephalon 9564 86.0 352.0
... ... ... ... ... ... ...
13371 13372 kirby906a_ax.img Caudate_tail_L 424 112.0 279.0
13372 13373 kirby906a_ax.img Caudate_tail_R 386 83.0 286.0
13373 13374 kirby906a_ax.img Chroid_LVetc_L 101 56.0 255.0
13374 13375 kirby906a_ax.img Chroid_LVetc_R 84 53.0 271.0
13375 13376 kirby906a_ax.img IV_ventricle 1835 3.0 275.0
mean std type level id icv tbv
12540 165.2599 57.1707 1 1 906 1195015 1123076
12541 171.8695 59.3001 1 1 906 1195015 1123076
12542 227.1878 31.2303 1 1 906 1195015 1123076
12543 231.6770 31.1780 1 1 906 1195015 1123076
12544 269.1003 28.6454 1 1 906 1195015 1123076
... ... ... ... ... ... ... ...
13371 182.8215 31.9975 2 5 906 1195015 1123076
13372 186.3707 37.6639 2 5 906 1195015 1123076
13373 181.6190 39.8132 2 5 906 1195015 1123076
13374 181.9857 43.3901 2 5 906 1195015 1123076
13375 108.3120 53.6962 2 5 906 1195015 1123076
[836 rows x 13 columns]
type level roi volume
0 1 1 CSF 71939
1 1 1 Diencephalon_L 8801
2 1 1 Diencephalon_R 9054
3 1 1 Mesencephalon 9564
4 1 1 Metencephalon 154071
.. ... ... ... ...
486 1 5 subcallosal_ACC_R 484
487 1 5 subgenualWM_ACC_L 218
488 1 5 subgenualWM_ACC_R 106
489 1 5 subgenual_ACC_L 1404
490 1 5 subgenual_ACC_R 1233
[491 rows x 4 columns]
# First 11 type-1 ROIs (the coarse, level-1 brain segmentation).
regions = type1.roi[0:11].tolist()
print(regions)

# BUG FIX: sum(subjofint.volume) summed every (type, level) copy of each ROI,
# yielding 11950461 — roughly 10x the true intracranial volume (2 types x
# 5 levels). The dataset already carries the subject's ICV as a constant
# column (1195015 in the printout above), so read it directly.
icv = [subjofint.icv.iloc[0]]

# Sankey inputs: ICV as the root node, then the 11 region volumes/labels.
san_valuelist = icv + type1.volume[0:11].tolist()
san_labels = ["icv"] + type1.roi[0:11].tolist()
print(san_valuelist)
print(san_labels)
# Sankey diagram: one source node (icv, index 0) fanning out to the 11 ROIs.
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = san_labels,
      color = "blue"
    ),
    link = dict(
      # All links originate from node 0 (icv); targets are the 11 region nodes.
      source = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      target = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
      value = san_valuelist
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
# BUG FIX: Figure.show has no `engine` argument and the renderer name is
# "colab" (one l) — the original fig.show(engine="collab") raised/failed.
fig.show(renderer="colab")
[11950461, 71939, 8801, 9054, 9564, 154071, 4035, 467063, 470488, 2686, 2609, 246947] ['icv', 'CSF', 'Diencephalon_L', 'Diencephalon_R', 'Mesencephalon', 'Metencephalon', 'Myelencephalon', 'Telencephalon_L', 'Telencephalon_R', 'BasalForebrain_L', 'BasalForebrain_R', 'CerebralCortex_L']
! jupyter nbconvert --to html hw4.ipynb
[NbConvertApp] Converting notebook hw4.ipynb to html [NbConvertApp] Writing 4291803 bytes to hw4.html
My public webpage
import sqlite3 as sq3

con = sq3.connect("opioid.db")
# cursor() creates an object that can execute SQL statements on the connection.
sql = con.cursor()

annual = pd.read_sql_query("select * from annual", con)
annual = annual.iloc[:, 1:]  # drop the leading stored-index column

# Inspect rows missing a FIPS code, excluding Puerto Rico.
# BUG FIX: the original `.loc(['BUYER_STATE'] != 'PR')` called .loc like a
# function on the constant comparison list != str (always True), so it
# silently did nothing. Combine both conditions into one boolean mask.
print(annual[annual['countyfips'].isnull() & (annual['BUYER_STATE'] != 'PR')])

# equivalent SQL check:
# for row in sql.execute("select * from annual where countyfips = 'NA' and BUYER_STATE != 'PR' limit 10;"):
#     print(row)
annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
Inspect the missing data further on your own. It looks like it's the unincorporated territories and a handful of Arkansas rows that are missing countyfips (Federal Information Processing Standard) codes. Specifically, Montgomery County, AR is missing FIPS codes. Since we want to look at US states specifically, excluding territories, we will just set the Montgomery County values to the correct code, 05097, and ignore the other missing values.
# Montgomery County, AR rows are missing their FIPS code; patch in the
# known-correct value 05097 (kept as a string to match the column's dtype
# and preserve the leading zero).
is_montgomery_ar = (annual['BUYER_STATE'] == 'AR') & (annual['BUYER_COUNTY'] == 'MONTGOMERY')
annual.loc[is_montgomery_ar, 'countyfips'] = '05097'
annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
Now let's delete rows from the annual table that have missing county data. Check on these counties beforehand and verify that they've been deleted afterwards. Also, we want to grab just three columns from the land table, so let's create a new one called land_area. Also, the column there is called STCOU, which we want to rename to countyfips. (I'm going to stop printing out the results of every step, so make sure you're checking your work as you go.)
# Normalize the literal string 'NA' to a real missing value so dropna sees it.
# BUG FIX: np.NaN was removed in NumPy 2.0; the canonical spelling is np.nan.
annual = annual.replace('NA', np.nan)
# Drop rows with no county name (subset given as a list per pandas convention).
# NOTE(review): the surrounding text says to drop rows with missing *county*
# data — if countyfips drives the downstream merge, subset=['countyfips']
# may be what was intended; confirm.
annual = annual.dropna(axis=0, subset=['BUYER_COUNTY'])
annual.tail()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 55495 | ZAVALA | TX | 2010 | 248 | 200100 | 48507 |
| 55496 | ZAVALA | TX | 2011 | 406 | 244800 | 48507 |
| 55497 | ZAVALA | TX | 2012 | 473 | 263700 | 48507 |
| 55498 | ZAVALA | TX | 2013 | 399 | 186700 | 48507 |
| 55499 | ZAVALA | TX | 2014 | 162 | 148930 | 48507 |
# sql.execute("create table land_area as select Areaname, STCOU, LND110210D from land;")
#sql.execute("alter table land_area rename column STCOU to countyfips;")
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) Input In [9], in <module> ----> 1 sql.execute("create table land_area as select Areaname, STCOU, LND110210D from land;") 3 sql.execute("alter table land_area rename column STCOU to countyfips;") OperationalError: table land_area already exists
land_area = pd.read_sql_query("select * from land_area", con)
# BUG FIX: the bare name STCOU raised NameError — the column lives on the
# dataframe. (The commented-out ALTER TABLE above never ran, so the stored
# column is still named STCOU; copy it into a countyfips column for the merge.)
land_area = land_area.assign(countyfips=land_area['STCOU'])
# BUG FIX: `con.close` without parentheses only referenced the method and
# never closed the connection; call it.
con.close()
land_area.head()
land_area.tail()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [10], in <module> 1 land_area = pd.read_sql_query("select * from land_area", con) ----> 3 land_area = land_area.assign(countyfips = STCOU) 5 # you have to close the connection 6 con.close NameError: name 'STCOU' is not defined
# Attach per-county land-area information to each annual record via the
# shared FIPS key; left join keeps every annual row even without a match.
county_info = pd.merge(annual, land_area, how='left', on='countyfips')
county_info.head()
county_info.tail()
Create an interactive scatter plot of the average number of opioid pills by year using plotly. See the example here. Don't do the intervals (little vertical lines), only the points. Add your plot to an html file in the repo with your Sankey diagram and host it publicly. Put a link to your hosted file in a markdown cell of your hw4.ipynb file. Note, an easy way to create a webpage with this graphic is to export an ipynb as an html file.
# Keep only complete rows, coerce the dosage column to numbers, and express
# pill counts in millions for a readable y-axis.
county_info = county_info.dropna(axis=0)
county_info["DOSAGE_UNIT"] = pd.to_numeric(county_info["DOSAGE_UNIT"])
county_info["Pills_in_millions"] = county_info["DOSAGE_UNIT"] / 1000000

# Average pills-in-millions per state per year.
keep_cols = ["BUYER_STATE", "year", "BUYER_COUNTY", "Pills_in_millions"]
state_info = (
    county_info[keep_cols]
    .groupby(by=["BUYER_STATE", "year"], as_index=False)
    .mean()
)
print(state_info)

# Interactive scatter of the yearly state averages.
raw_state_avg = px.scatter(data_frame=state_info, x="year", y="Pills_in_millions")
raw_state_avg.show()
! jupyter nbconvert --to html hw4.ipynb